Binder 线程池溢出问题

阿豪 8/28/2023

# 线程池溢出情景分析

客户端发现服务端线程用完了的情景：

/*
 * binder_transaction() - excerpt showing only the synchronous-call path.
 *
 * Code that is not relevant to this analysis is elided and marked with
 * "//......". Locals used below (t, tcomplete, e, t_debug_id, target_proc,
 * target_thread, target_node) are set up in code that is not shown here.
 */
static void binder_transaction(struct binder_proc *proc,
			       struct binder_thread *thread,
			       struct binder_transaction_data *tr, int reply,
			       binder_size_t extra_buffers_size)
{
	//......
	if (reply) {
		//......
	} else if (!(t->flags & TF_ONE_WAY)) {
		/* Neither a reply nor a one-way call: the synchronous path. */
		BUG_ON(t->buffer->async_transaction != 0);
		binder_inner_proc_lock(proc);
		/* Queue tcomplete as deferred work on the calling thread. */
		binder_enqueue_deferred_thread_work_ilocked(thread, tcomplete);
		// Record bookkeeping: a reply is expected, and t is pushed on
		// top of the calling thread's transaction stack.
		t->need_reply = 1;
		t->from_parent = thread->transaction_stack;
		thread->transaction_stack = t;
		binder_inner_proc_unlock(proc);
		// Insert t->work into the target's todo queue and wake up the
		// target. A false return means the target process or thread is
		// dead, so pop t off the caller's stack again and bail out.
		if (!binder_proc_transaction(t, target_proc, target_thread)) {
			binder_inner_proc_lock(proc);
			binder_pop_transaction_ilocked(thread, t);
			binder_inner_proc_unlock(proc);
			goto err_dead_proc_or_thread;
		}
	} else {
		//......
	}
	/* Drop the temporary references on the target thread, process and node. */
	if (target_thread)
		binder_thread_dec_tmpref(target_thread);
	binder_proc_dec_tmpref(target_proc);
	if (target_node)
		binder_dec_node_tmpref(target_node);
	/*
	 * write barrier to synchronize with initialization
	 * of log entry
	 */
	smp_wmb();
	WRITE_ONCE(e->debug_id_done, t_debug_id);
	return;

	//......
}

binder_proc_transaction 的实现如下：

/*
 * binder_proc_transaction() - queue transaction t on the target and wake it.
 * @t:      transaction to deliver
 * @proc:   target process
 * @thread: target thread, or NULL to let this function try to pick one
 *
 * Returns false if the target process (or the given target thread) is dead;
 * returns true once t->work has been queued.
 *
 * The "taken in this scenario" notes trace a synchronous call that arrives
 * while every binder thread of the target process is busy.
 */
static bool binder_proc_transaction(struct binder_transaction *t,
				    struct binder_proc *proc,
				    struct binder_thread *thread)
{
	struct binder_node *node = t->buffer->target_node;
	struct binder_priority node_prio;
	bool oneway = !!(t->flags & TF_ONE_WAY);
	bool pending_async = false;

	BUG_ON(!node);
	binder_node_lock(node);
	/* Snapshot the node's priority settings while the node lock is held. */
	node_prio.prio = node->min_priority;
	node_prio.sched_policy = node->sched_policy;

	if (oneway) {
		/* A one-way transaction must not come with an explicit target thread. */
		BUG_ON(thread);
		/*
		 * If the node is already flagged as having an async transaction,
		 * this one is marked pending; otherwise set the flag now.
		 */
		if (node->has_async_transaction) {
			pending_async = true;
		} else {
			node->has_async_transaction = true;
		}
	}

	binder_inner_proc_lock(proc);

	// The target process is dead, or the target thread is non-NULL and dead
	if (proc->is_dead || (thread && thread->is_dead)) {
		binder_inner_proc_unlock(proc);
		binder_node_unlock(node);
		return false;
	}

	// thread is NULL
	// pending_async is false
	if (!thread && !pending_async) // taken in this scenario
	// Pick the first entry of target_proc's waiting_threads list as target_thread.
	// If there is no idle thread, this returns NULL.
		thread = binder_select_thread_ilocked(proc);

	if (thread) {
		binder_transaction_priority(thread->task, t, node_prio,
					    node->inherit_rt);
		// Insert the binder_transaction into target_thread's todo list
		binder_enqueue_thread_work_ilocked(thread, &t->work);
	} else if (!pending_async) {// synchronous call with no idle thread: taken in this scenario
		// Queue the work on proc->todo
		binder_enqueue_work_ilocked(&t->work, &proc->todo);
	} else {
		/* Pending one-way work is parked on the node's async_todo list. */
		binder_enqueue_work_ilocked(&t->work, &node->async_todo);
	}

	if (!pending_async) // taken in this scenario
		// Wake up the remote thread (thread is still NULL here in this scenario)
		binder_wakeup_thread_ilocked(proc, thread, !oneway /* sync */);

	binder_inner_proc_unlock(proc);
	binder_node_unlock(node);

	return true;
}

// Pick the first entry of target_proc's waiting_threads list as target_thread.
// In this scenario every server thread is busy,
// so this returns NULL.
//
// The caller must hold proc->inner_lock (asserted below). When a thread is
// found, it is also unlinked from the waiting_threads list before being returned.
static struct binder_thread *
binder_select_thread_ilocked(struct binder_proc *proc)
{
	struct binder_thread *thread;

	assert_spin_locked(&proc->inner_lock);
	thread = list_first_entry_or_null(&proc->waiting_threads,
					  struct binder_thread,
					  waiting_thread_node);

	if (thread)
		list_del_init(&thread->waiting_thread_node);

	return thread;
}


// Append work to the tail of target_list.
// In this scenario target_list is target_proc's todo list, so this is where
// the binder_transaction's work item is queued on the process.
static void
binder_enqueue_work_ilocked(struct binder_work *work,
			   struct list_head *target_list)
{
	BUG_ON(target_list == NULL);
	/* Sanity check: presumably catches a work item that is still linked on a list. */
	BUG_ON(work->entry.next && !list_empty(&work->entry));
	list_add_tail(&work->entry, target_list);
}

// Wake up the receiving side.
// If a specific thread is given, wake that thread's wait queue (using the
// _sync variant when sync is true); otherwise fall back to waking the
// process's poll threads. The caller must hold proc->inner_lock.
static void binder_wakeup_thread_ilocked(struct binder_proc *proc,
					 struct binder_thread *thread,
					 bool sync)
{
	assert_spin_locked(&proc->inner_lock);

	if (thread) { // thread is NULL in this scenario, so this branch is skipped
		if (sync) 
			wake_up_interruptible_sync(&thread->wait);
		else
			wake_up_interruptible(&thread->wait);
		return;
	}
	// taken in this scenario
	binder_wakeup_poll_threads_ilocked(proc, sync);
}


// Walk every thread in binder_proc's threads tree, one by one, and wake those
// that have BINDER_LOOPER_STATE_POLL set and are available for process work.
static void binder_wakeup_poll_threads_ilocked(struct binder_proc *proc,
					       bool sync)
{
	struct rb_node *n;
	struct binder_thread *thread;

	for (n = rb_first(&proc->threads); n != NULL; n = rb_next(n)) {
		thread = rb_entry(n, struct binder_thread, rb_node);
		if (thread->looper & BINDER_LOOPER_STATE_POLL &&
		    binder_available_for_proc_work_ilocked(thread)) {
			// Author's note: at this point the thread is in the running
			// state, so it will not respond to the wake_up immediately.
			if (sync)
				// Wake the target thread; per the author's note, it
				// gets to run at the next process/thread scheduling.
				wake_up_interruptible_sync(&thread->wait);
			else
				wake_up_interruptible(&thread->wait);
		}
	}
}

服务端线程的响应：

// Once a server thread has finished its current work it enters the next loop
// iteration and reads from the driver again, which lands here.
//
// Excerpt: code not relevant to this analysis is elided and marked "//......".
// Locals such as ret, wait_for_proc_work, ptr, buffer and end are set up in
// code that is not shown here.
static int binder_thread_read(struct binder_proc *proc,
			      struct binder_thread *thread,
			      binder_uintptr_t binder_buffer, size_t size,
			      binder_size_t *consumed, int non_block)
{
	
    //......
retry:
	binder_inner_proc_lock(proc);
    // Returns true in this scenario
	wait_for_proc_work = binder_available_for_proc_work_ilocked(thread);
	binder_inner_proc_unlock(proc);

	/* Flag the thread as waiting for the duration of the wait below. */
	thread->looper |= BINDER_LOOPER_STATE_WAITING;

	//......

	if (non_block) {
		/* Non-blocking read: report -EAGAIN instead of sleeping when there is no work. */
		if (!binder_has_work(thread, wait_for_proc_work))
			ret = -EAGAIN;
	} else {
        // Does not actually sleep in this scenario; execution continues below
		ret = binder_wait_for_work(thread, wait_for_proc_work);
	}

	thread->looper &= ~BINDER_LOOPER_STATE_WAITING;

	if (ret)
		return ret;

    //......

    while (1) {
		uint32_t cmd;
		struct binder_transaction_data_secctx tr;
		struct binder_transaction_data *trd = &tr.transaction_data;
		struct binder_work *w = NULL;
		struct list_head *list = NULL;
		struct binder_transaction *t = NULL;
		struct binder_thread *t_from;
		size_t trsize = sizeof(*trd);

		binder_inner_proc_lock(proc);
		/*
		 * Choose the work list: the thread's own todo list first, then
		 * proc->todo if this thread is allowed to take process work.
		 */
		if (!binder_worklist_empty_ilocked(&thread->todo))
			list = &thread->todo;
		else if (!binder_worklist_empty_ilocked(&proc->todo) &&
			   wait_for_proc_work) // this branch is taken in this scenario
			list = &proc->todo; // use the todo list of binder_proc
		else {
			binder_inner_proc_unlock(proc);

			/* no data added */
			if (ptr - buffer == 4 && !thread->looper_need_return)
				goto retry;
			break;
		}

		/* Stop when the space left before end cannot hold tr plus 4 more bytes. */
		if (end - ptr < sizeof(tr) + 4) {
			binder_inner_proc_unlock(proc);
			break;
		}
		w = binder_dequeue_work_head_ilocked(list); // take binder_work items from the head, one at a time
		if (binder_worklist_empty_ilocked(&thread->todo))
			thread->process_todo = false;
		
		switch (w->type) {
			//......
		}
		//......
	}
}

/*
 * Report whether thread may pick up work queued on the process: it must have
 * an empty transaction stack, an empty todo list of its own, and at least one
 * of BINDER_LOOPER_STATE_ENTERED / BINDER_LOOPER_STATE_REGISTERED set in looper.
 */
static bool binder_available_for_proc_work_ilocked(struct binder_thread *thread)
{   
	return !thread->transaction_stack &&
		binder_worklist_empty_ilocked(&thread->todo) &&
		(thread->looper & (BINDER_LOOPER_STATE_ENTERED |
				   BINDER_LOOPER_STATE_REGISTERED));
}

/*
 * binder_wait_for_work() - block the calling thread until it has work.
 * @thread:       the binder thread that wants work
 * @do_proc_work: whether the thread may also take work queued on the process
 *
 * Returns 0 once binder_has_work_ilocked() reports work, or -ERESTARTSYS if
 * a signal is pending after waking up.
 */
static int binder_wait_for_work(struct binder_thread *thread,
				bool do_proc_work)
{
	DEFINE_WAIT(wait);
	struct binder_proc *proc = thread->proc;
	int ret = 0;

	freezer_do_not_count();
	binder_inner_proc_lock(proc);
	for (;;) {
		prepare_to_wait(&thread->wait, &wait, TASK_INTERRUPTIBLE);
		// Returns true in this scenario, so break out without sleeping
		if (binder_has_work_ilocked(thread, do_proc_work))
			break;
		/* Make the thread selectable via proc->waiting_threads while it sleeps. */
		if (do_proc_work)
			list_add(&thread->waiting_thread_node,
				 &proc->waiting_threads);
		/* The inner lock is dropped across schedule() and retaken afterwards. */
		binder_inner_proc_unlock(proc);
		schedule();
		binder_inner_proc_lock(proc);
		list_del_init(&thread->waiting_thread_node);
		if (signal_pending(current)) {
			ret = -ERESTARTSYS;
			break;
		}
	}
	// Tear down the wait entry and continue with the code that follows
	finish_wait(&thread->wait, &wait);
	binder_inner_proc_unlock(proc);
	freezer_count();

	return ret;
}

# 线程池溢出 Debug 技巧

Binder 线程数达到上限的典型情景是：app 向 service 发起请求的频率过高，而 service 端对所有业务的执行都加了锁，导致 service 端用于接收和处理 binder 事件的线程全部卡住；当线程池（默认 16 个线程）耗尽之后，就无法再处理新的请求。如果这个时候 app 的主线程再调用该 service 提供的方法，就很容易出现 ANR。

我们可以通过 /sys/kernel/debug/binder 这个目录下的文件来查看所有进程使用 binder 的状况，从而确定是否有线程池溢出的情况出现：

一般是通过 transactions 这个文件来查看 binder 运行状况：

binder transactions:
proc 2890
context binder
  buffer 45136: 0000000000000000 size 4:0:0 delivered
proc 2197
context hwbinder
  buffer 29820: 0000000000000000 size 4:0:0 delivered

第一行是 proc 2890 表示进程的 pid，第二行表示 binder 类型，接下来就是 buffer，用于表示内核中的 binder_buffer，如果这个 buffer 在程序运行过程中越来越多，那么就有内存泄漏的可能存在，不过这种情况很少，应用程序都是通过 libbinder 这个库来使用 binder 驱动的，在一次 ipc 调用结束后，库本身会执行 binder_buffer 的清理操作，这种系统中的公用库一般都非常稳定，出 bug 的几率很小。

proc 1523
context hwbinder
  thread 1542: l 02 need_return 0 tr 0
    incoming transaction 126150: 0000000000000000 from 1744:1838 to 1523:1542 code 4 flags 10 pri 1:89 r1 node 234 size 44:0 data 0000000000000000
  buffer 126150: 0000000000000000 size 44:0:0 active

还有一种情况是 proc 下面会出现 thread：如果 thread 特别多，那么就可能存在线程池溢出的情况。为了进一步确认线程是否已经占满，可以 dump 该进程获得其墓碑文件（tombstone），基本就能看出各线程的情况。

解决线程池耗尽的问题，一般是从 app 端去限制请求的频率，如果不能拿到 app 源码，我们也可以修改 Framework 源码来限制特定的 IPC 远程调用的访问频率。

# 参考资料

- 034.Binder 多线程情景分析
- 036.Binder 代理对象泄露问题分析

Android Framework

Choose mode